import json
from glob import glob
from tqdm import tqdm

base_path = "D:/datas/wiki_zh_2019/wiki_zh/**/*/*"

# Every path matched by the pattern is opened below as a UTF-8 JSON-lines file.
file_list = glob(base_path, recursive=True)


def _extract_text(line):
    """Return the whitespace-free "text" field of one JSON line, or None.

    Args:
        line: One raw line read from an input file; expected to hold a
            single JSON object.

    Returns:
        The value of the "text" key with all whitespace removed, or None
        when the line is blank, is not valid JSON, is not a JSON object,
        has no non-empty string under "text", or consists only of
        whitespace.
    """
    if not line.strip():
        return None
    try:
        record = json.loads(line)
    except json.JSONDecodeError:
        # Report the offending line and skip it. Falling through here would
        # either raise NameError or silently reuse the previous record.
        print(line)
        return None
    if not isinstance(record, dict):
        return None
    text = record.get("text")
    if not text or not isinstance(text, str):
        return None
    # str.split() with no argument splits on any whitespace run, so joining
    # the pieces removes spaces, tabs and newlines in one pass. An all
    # whitespace text collapses to "" and is reported as None so that no
    # blank line ends up in the output.
    return "".join(text.split()) or None


for file in tqdm(file_list, desc="loading"):
    # Iterate the handle directly so the file is streamed line by line
    # instead of being loaded into memory as a whole.
    with open(file, "r", encoding="utf-8") as f:
        results = [text for text in map(_extract_text, f) if text]

    if results:
        # Append mode: texts from every input file accumulate in one output
        # file, one text per line.
        with open("./source.txt", "a", encoding="utf-8") as f:
            f.writelines(text + "\n" for text in results)